#rm(list = ls())
#packages
library(bayesrules)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
library(randomForest)
randomForest 4.7-1.1
Type rfNews() to see new features/changes/bug fixes.
library(rpart)
library(tree)
library(pROC)
Type 'citation("pROC")' for a citation.
Attaching package: ‘pROC’
The following objects are masked from ‘package:stats’:
cov, smooth, var
library(mgcv)
Loading required package: nlme
This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
library(ISLR)
library(dplyr)
Attaching package: ‘dplyr’
The following object is masked from ‘package:nlme’:
collapse
The following object is masked from ‘package:randomForest’:
combine
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ──────────────────────────────────────────────────────── tidyverse 1.3.2 ──✔ ggplot2 3.3.6 ✔ purrr 0.3.4
✔ tibble 3.1.8 ✔ stringr 1.4.1
✔ tidyr 1.2.1 ✔ forcats 0.5.2
✔ readr 2.1.2 ── Conflicts ─────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::collapse() masks nlme::collapse()
✖ dplyr::combine() masks randomForest::combine()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ ggplot2::margin() masks randomForest::margin()
library(faraway)
Attaching package: ‘faraway’
The following object is masked from ‘package:rpart’:
solder
library(olsrr)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘olsrr’
The following object is masked from ‘package:faraway’:
hsb
The following object is masked from ‘package:datasets’:
rivers
library(caret)
Loading required package: lattice
Attaching package: ‘lattice’
The following object is masked from ‘package:faraway’:
melanoma
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
# Data set: work on a copy of the built-in mtcars data frame
data1 <- datasets::mtcars
# Per-column five-number summary and mean
summary(data1)
mpg cyl disp hp drat wt
Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0 Min. :2.760 Min. :1.513
1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5 1st Qu.:3.080 1st Qu.:2.581
Median :19.20 Median :6.000 Median :196.3 Median :123.0 Median :3.695 Median :3.325
Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7 Mean :3.597 Mean :3.217
3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0 3rd Qu.:3.920 3rd Qu.:3.610
Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0 Max. :4.930 Max. :5.424
qsec vs am gear carb
Min. :14.50 Min. :0.0000 Min. :0.0000 Min. :3.000 Min. :1.000
1st Qu.:16.89 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
Median :17.71 Median :0.0000 Median :0.0000 Median :4.000 Median :2.000
Mean :17.85 Mean :0.4375 Mean :0.4062 Mean :3.688 Mean :2.812
3rd Qu.:18.90 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
Max. :22.90 Max. :1.0000 Max. :1.0000 Max. :5.000 Max. :8.000
# Preview the first rows, then draw the scatterplot matrix of all variables
head(data1)
pairs(data1)
# Conclusion
# Several of the variables are linearly related to each other based on the
# graph (e.g. mpg ~ drat, hp ~ wt).
# Pairwise correlation matrix, to quantify what the scatterplots show
cor(data1)
mpg cyl disp hp drat wt qsec vs
mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594 0.41868403 0.6640389
cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958 -0.59124207 -0.8108118
disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799 -0.43369788 -0.7104159
hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479 -0.70822339 -0.7230967
drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406 0.09120476 0.4402785
wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000 -0.17471588 -0.5549157
qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159 1.00000000 0.7445354
vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157 0.74453544 1.0000000
am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953 -0.22986086 0.1683451
gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870 -0.21268223 0.2060233
carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059 -0.65624923 -0.5696071
am gear carb
mpg 0.59983243 0.4802848 -0.55092507
cyl -0.52260705 -0.4926866 0.52698829
disp -0.59122704 -0.5555692 0.39497686
hp -0.24320426 -0.1257043 0.74981247
drat 0.71271113 0.6996101 -0.09078980
wt -0.69249526 -0.5832870 0.42760594
qsec -0.22986086 -0.2126822 -0.65624923
vs 0.16834512 0.2060233 -0.56960714
am 1.00000000 0.7940588 0.05753435
gear 0.79405876 1.0000000 0.27407284
carb 0.05753435 0.2740728 1.00000000
# Full model: regress mpg on all ten other mtcars variables
mr_model <- lm(
  formula = mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb,
  data = data1
)
summary(mr_model)
Call:
lm(formula = mpg ~ cyl + disp + hp + drat + wt + qsec + vs +
am + gear + carb, data = data1)
Residuals:
Min 1Q Median 3Q Max
-3.4506 -1.6044 -0.1196 1.2193 4.6271
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.30337 18.71788 0.657 0.5181
cyl -0.11144 1.04502 -0.107 0.9161
disp 0.01334 0.01786 0.747 0.4635
hp -0.02148 0.02177 -0.987 0.3350
drat 0.78711 1.63537 0.481 0.6353
wt -3.71530 1.89441 -1.961 0.0633 .
qsec 0.82104 0.73084 1.123 0.2739
vs 0.31776 2.10451 0.151 0.8814
am 2.52023 2.05665 1.225 0.2340
gear 0.65541 1.49326 0.439 0.6652
carb -0.19942 0.82875 -0.241 0.8122
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.65 on 21 degrees of freedom
Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
# Conclusion from the summary
# 1. None of the individual t-tests is significant at the 5% level, which
#    suggests that the full model may be a poor choice for predicting mpg.
# Variance inflation factors, to check the predictors for multicollinearity
vif(mr_model)
cyl disp hp drat wt qsec vs am gear
15.373833 21.620241 9.832037 3.374620 15.164887 7.527958 4.965873 4.648487 5.357452
carb
7.908747
# 2. Some of the predictors have multicollinearity issues
#    (cyl, disp and wt all have a VIF above 15).
# Standard lm diagnostic plots, followed by olsrr added-variable plots
plot(mr_model)
ols_plot_added_variable(mr_model)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
# olsrr summary of the same full model; the outer parentheses print the result
(ols_regress(mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb, data1))
Model Summary
--------------------------------------------------------------
R 0.932 RMSE 2.650
R-Squared 0.869 Coef. Var 13.191
Adj. R-Squared 0.807 MSE 7.024
Pred R-Squared 0.654 MAE 1.723
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 978.553 10 97.855 13.932 0.0000
Residual 147.494 21 7.024
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
-----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
-----------------------------------------------------------------------------------------
(Intercept) 12.303 18.718 0.657 0.518 -26.623 51.229
cyl -0.111 1.045 -0.033 -0.107 0.916 -2.285 2.062
disp 0.013 0.018 0.274 0.747 0.463 -0.024 0.050
hp -0.021 0.022 -0.244 -0.987 0.335 -0.067 0.024
drat 0.787 1.635 0.070 0.481 0.635 -2.614 4.188
wt -3.715 1.894 -0.603 -1.961 0.063 -7.655 0.224
qsec 0.821 0.731 0.243 1.123 0.274 -0.699 2.341
vs 0.318 2.105 0.027 0.151 0.881 -4.059 4.694
am 2.520 2.057 0.209 1.225 0.234 -1.757 6.797
gear 0.655 1.493 0.080 0.439 0.665 -2.450 3.761
carb -0.199 0.829 -0.053 -0.241 0.812 -1.923 1.524
-----------------------------------------------------------------------------------------
Conclusion: 1. The Q-Q plot shows that the full-model residuals are approximately normally distributed. 2. Cook's D flags 4 of the 32 observations as outliers, which is a fairly high proportion.
# Normal Q-Q plot of the full-model residuals, with its reference line
qqnorm(
  residuals(mr_model),
  main = "Q-Q plot",
  ylab = "Residuals"
)
qqline(residuals(mr_model))

# Cook's distance: bar plot and chart
ols_plot_cooksd_bar(mr_model)
ols_plot_cooksd_chart(mr_model)

# DFBETAS panel
ols_plot_dfbetas(mr_model)

# DFFITS plot
ols_plot_dffits(mr_model)

# Studentized residuals
ols_plot_resid_stud(mr_model)

# Standardized residuals
ols_plot_resid_stand(mr_model)

# Studentized residuals vs leverage
ols_plot_resid_lev(mr_model)

# Deleted studentized residuals vs fitted values
ols_plot_resid_stud_fit(mr_model)

# Hadi plot
ols_plot_hadi(mr_model)

# Potential-residual plot
ols_plot_resid_pot(mr_model)
Conclusion: Backward elimination (removal threshold p = 0.3) gives mpg ~ disp + hp + wt + qsec + am as the best model.
# Backward elimination on p-values from the full model, printing every step
backward.reg <- ols_step_backward_p(mr_model, details = TRUE)
Backward Elimination Method
---------------------------
Candidate Terms:
1 . cyl
2 . disp
3 . hp
4 . drat
5 . wt
6 . qsec
7 . vs
8 . am
9 . gear
10 . carb
We are eliminating variables based on p value...
x cyl
Backward Elimination: Step 1
Variable cyl Removed
Model Summary
--------------------------------------------------------------
R 0.932 RMSE 2.590
R-Squared 0.869 Coef. Var 12.891
Adj. R-Squared 0.815 MSE 6.708
Pred R-Squared 0.704 MAE 1.720
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 978.473 9 108.719 16.208 0.0000
Residual 147.574 22 6.708
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
-----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
-----------------------------------------------------------------------------------------
(Intercept) 10.960 13.530 0.810 0.427 -17.100 39.020
disp 0.013 0.017 0.264 0.763 0.454 -0.022 0.048
hp -0.022 0.021 -0.249 -1.048 0.306 -0.065 0.021
drat 0.835 1.536 0.074 0.544 0.592 -2.351 4.021
wt -3.693 1.840 -0.599 -2.007 0.057 -7.507 0.122
qsec 0.842 0.687 0.250 1.227 0.233 -0.582 2.267
vs 0.390 1.948 0.033 0.200 0.843 -3.650 4.430
am 2.577 1.940 0.213 1.328 0.198 -1.447 6.601
gear 0.712 1.366 0.087 0.521 0.608 -2.121 3.544
carb -0.220 0.789 -0.059 -0.278 0.783 -1.855 1.416
-----------------------------------------------------------------------------------------
x vs
Backward Elimination: Step 2
Variable vs Removed
Model Summary
--------------------------------------------------------------
R 0.932 RMSE 2.535
R-Squared 0.869 Coef. Var 12.620
Adj. R-Squared 0.823 MSE 6.428
Pred R-Squared 0.732 MAE 1.741
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 978.204 8 122.276 19.022 0.0000
Residual 147.843 23 6.428
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
-----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
-----------------------------------------------------------------------------------------
(Intercept) 9.768 11.892 0.821 0.420 -14.833 34.369
disp 0.012 0.016 0.250 0.753 0.459 -0.021 0.045
hp -0.021 0.020 -0.238 -1.051 0.304 -0.062 0.020
drat 0.875 1.491 0.078 0.587 0.563 -2.210 3.960
wt -3.712 1.798 -0.603 -2.064 0.050 -7.432 0.009
qsec 0.911 0.583 0.270 1.562 0.132 -0.295 2.117
am 2.524 1.881 0.209 1.342 0.193 -1.368 6.416
gear 0.760 1.316 0.093 0.577 0.569 -1.962 3.482
carb -0.248 0.759 -0.066 -0.327 0.747 -1.819 1.323
-----------------------------------------------------------------------------------------
x carb
Backward Elimination: Step 3
Variable carb Removed
Model Summary
--------------------------------------------------------------
R 0.932 RMSE 2.488
R-Squared 0.868 Coef. Var 12.382
Adj. R-Squared 0.830 MSE 6.189
Pred R-Squared 0.762 MAE 1.743
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 977.519 7 139.646 22.565 0.0000
Residual 148.528 24 6.189
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
-----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
-----------------------------------------------------------------------------------------
(Intercept) 9.198 11.542 0.797 0.433 -14.624 33.020
disp 0.016 0.012 0.319 1.278 0.213 -0.010 0.041
hp -0.025 0.016 -0.281 -1.548 0.135 -0.058 0.008
drat 0.810 1.450 0.072 0.559 0.582 -2.183 3.803
wt -4.131 1.236 -0.671 -3.342 0.003 -6.681 -1.580
qsec 1.010 0.489 0.299 2.066 0.050 0.001 2.019
am 2.590 1.835 0.214 1.411 0.171 -1.198 6.378
gear 0.606 1.206 0.074 0.503 0.620 -1.883 3.095
-----------------------------------------------------------------------------------------
x gear
Backward Elimination: Step 4
Variable gear Removed
Model Summary
--------------------------------------------------------------
R 0.931 RMSE 2.450
R-Squared 0.867 Coef. Var 12.196
Adj. R-Squared 0.835 MSE 6.004
Pred R-Squared 0.785 MAE 1.769
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 975.954 6 162.659 27.093 0.0000
Residual 150.093 25 6.004
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
-----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
-----------------------------------------------------------------------------------------
(Intercept) 10.711 10.975 0.976 0.338 -11.894 33.315
disp 0.013 0.011 0.269 1.193 0.244 -0.010 0.036
hp -0.022 0.015 -0.248 -1.488 0.149 -0.052 0.008
drat 1.021 1.367 0.091 0.746 0.462 -1.796 3.837
wt -4.045 1.206 -0.657 -3.355 0.003 -6.527 -1.562
qsec 0.991 0.480 0.294 2.064 0.050 0.002 1.979
am 2.985 1.634 0.247 1.827 0.080 -0.380 6.350
-----------------------------------------------------------------------------------------
x drat
Backward Elimination: Step 5
Variable drat Removed
Model Summary
--------------------------------------------------------------
R 0.929 RMSE 2.429
R-Squared 0.864 Coef. Var 12.092
Adj. R-Squared 0.838 MSE 5.901
Pred R-Squared 0.798 MAE 1.815
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 972.609 5 194.522 32.962 0.0000
Residual 153.438 26 5.901
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
----------------------------------------------------------------------------------------
(Intercept) 14.362 9.741 1.474 0.152 -5.661 34.384
disp 0.011 0.011 0.231 1.060 0.299 -0.011 0.033
hp -0.021 0.015 -0.241 -1.460 0.156 -0.051 0.009
wt -4.084 1.194 -0.663 -3.420 0.002 -6.539 -1.630
qsec 1.007 0.475 0.299 2.118 0.044 0.030 1.984
am 3.470 1.486 0.287 2.336 0.027 0.416 6.525
----------------------------------------------------------------------------------------
No more variables satisfy the condition of p value = 0.3
Variables Removed:
x cyl
x vs
x carb
x gear
x drat
Final Model Output
------------------
Model Summary
--------------------------------------------------------------
R 0.929 RMSE 2.429
R-Squared 0.864 Coef. Var 12.092
Adj. R-Squared 0.838 MSE 5.901
Pred R-Squared 0.798 MAE 1.815
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 972.609 5 194.522 32.962 0.0000
Residual 153.438 26 5.901
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
----------------------------------------------------------------------------------------
(Intercept) 14.362 9.741 1.474 0.152 -5.661 34.384
disp 0.011 0.011 0.231 1.060 0.299 -0.011 0.033
hp -0.021 0.015 -0.241 -1.460 0.156 -0.051 0.009
wt -4.084 1.194 -0.663 -3.420 0.002 -6.539 -1.630
qsec 1.007 0.475 0.299 2.118 0.044 0.030 1.984
am 3.470 1.486 0.287 2.336 0.027 0.416 6.525
----------------------------------------------------------------------------------------
Conclusion: Forward selection gives mpg ~ wt + cyl + hp as the best model.
# Forward selection on p-values over the full model's terms, printing every step
forward.reg <- ols_step_forward_p(mr_model, details = TRUE)
Forward Selection Method
---------------------------
Candidate Terms:
1. cyl
2. disp
3. hp
4. drat
5. wt
6. qsec
7. vs
8. am
9. gear
10. carb
We are selecting variables based on p value...
Forward Selection: Step 1
+ wt
Model Summary
--------------------------------------------------------------
R 0.868 RMSE 3.046
R-Squared 0.753 Coef. Var 15.161
Adj. R-Squared 0.745 MSE 9.277
Pred R-Squared 0.709 MAE 2.341
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 847.725 1 847.725 91.375 0.0000
Residual 278.322 30 9.277
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
----------------------------------------------------------------------------------------
(Intercept) 37.285 1.878 19.858 0.000 33.450 41.120
wt -5.344 0.559 -0.868 -9.559 0.000 -6.486 -4.203
----------------------------------------------------------------------------------------
Forward Selection: Step 2
+ cyl
Model Summary
--------------------------------------------------------------
R 0.911 RMSE 2.568
R-Squared 0.830 Coef. Var 12.780
Adj. R-Squared 0.819 MSE 6.592
Pred R-Squared 0.790 MAE 1.921
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 934.875 2 467.438 70.908 0.0000
Residual 191.172 29 6.592
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
----------------------------------------------------------------------------------------
(Intercept) 39.686 1.715 23.141 0.000 36.179 43.194
wt -3.191 0.757 -0.518 -4.216 0.000 -4.739 -1.643
cyl -1.508 0.415 -0.447 -3.636 0.001 -2.356 -0.660
----------------------------------------------------------------------------------------
Forward Selection: Step 3
+ hp
Model Summary
--------------------------------------------------------------
R 0.918 RMSE 2.512
R-Squared 0.843 Coef. Var 12.501
Adj. R-Squared 0.826 MSE 6.308
Pred R-Squared 0.796 MAE 1.845
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 949.427 3 316.476 50.171 0.0000
Residual 176.621 28 6.308
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
----------------------------------------------------------------------------------------
(Intercept) 38.752 1.787 21.687 0.000 35.092 42.412
wt -3.167 0.741 -0.514 -4.276 0.000 -4.684 -1.650
cyl -0.942 0.551 -0.279 -1.709 0.098 -2.070 0.187
hp -0.018 0.012 -0.205 -1.519 0.140 -0.042 0.006
----------------------------------------------------------------------------------------
No more variables to be added.
Variables Entered:
+ wt
+ cyl
+ hp
Final Model Output
------------------
Model Summary
--------------------------------------------------------------
R 0.918 RMSE 2.512
R-Squared 0.843 Coef. Var 12.501
Adj. R-Squared 0.826 MSE 6.308
Pred R-Squared 0.796 MAE 1.845
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 949.427 3 316.476 50.171 0.0000
Residual 176.621 28 6.308
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
----------------------------------------------------------------------------------------
(Intercept) 38.752 1.787 21.687 0.000 35.092 42.412
wt -3.167 0.741 -0.514 -4.276 0.000 -4.684 -1.650
cyl -0.942 0.551 -0.279 -1.709 0.098 -2.070 0.187
hp -0.018 0.012 -0.205 -1.519 0.140 -0.042 0.006
----------------------------------------------------------------------------------------
## 4. Forward Selection Conclusion: Forward selection
gives mpg ~ wt + cyl + hp as the best model.
# Stepwise (both-direction) selection on p-values over the full model's
# terms, printing every step. The result gets its own name so that the
# forward-selection result held in forward.reg is not overwritten.
stepwise.reg <- ols_step_both_p(mr_model, details = TRUE)
Stepwise Selection Method
---------------------------
Candidate Terms:
1. cyl
2. disp
3. hp
4. drat
5. wt
6. qsec
7. vs
8. am
9. gear
10. carb
We are selecting variables based on p value...
Stepwise Selection: Step 1
+ wt
Model Summary
--------------------------------------------------------------
R 0.868 RMSE 3.046
R-Squared 0.753 Coef. Var 15.161
Adj. R-Squared 0.745 MSE 9.277
Pred R-Squared 0.709 MAE 2.341
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 847.725 1 847.725 91.375 0.0000
Residual 278.322 30 9.277
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
----------------------------------------------------------------------------------------
(Intercept) 37.285 1.878 19.858 0.000 33.450 41.120
wt -5.344 0.559 -0.868 -9.559 0.000 -6.486 -4.203
----------------------------------------------------------------------------------------
Stepwise Selection: Step 2
+ cyl
Model Summary
--------------------------------------------------------------
R 0.911 RMSE 2.568
R-Squared 0.830 Coef. Var 12.780
Adj. R-Squared 0.819 MSE 6.592
Pred R-Squared 0.790 MAE 1.921
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 934.875 2 467.438 70.908 0.0000
Residual 191.172 29 6.592
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
----------------------------------------------------------------------------------------
(Intercept) 39.686 1.715 23.141 0.000 36.179 43.194
wt -3.191 0.757 -0.518 -4.216 0.000 -4.739 -1.643
cyl -1.508 0.415 -0.447 -3.636 0.001 -2.356 -0.660
----------------------------------------------------------------------------------------
Model Summary
--------------------------------------------------------------
R 0.911 RMSE 2.568
R-Squared 0.830 Coef. Var 12.780
Adj. R-Squared 0.819 MSE 6.592
Pred R-Squared 0.790 MAE 1.921
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 934.875 2 467.438 70.908 0.0000
Residual 191.172 29 6.592
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
----------------------------------------------------------------------------------------
(Intercept) 39.686 1.715 23.141 0.000 36.179 43.194
wt -3.191 0.757 -0.518 -4.216 0.000 -4.739 -1.643
cyl -1.508 0.415 -0.447 -3.636 0.001 -2.356 -0.660
----------------------------------------------------------------------------------------
No more variables to be added/removed.
Final Model Output
------------------
Model Summary
--------------------------------------------------------------
R 0.911 RMSE 2.568
R-Squared 0.830 Coef. Var 12.780
Adj. R-Squared 0.819 MSE 6.592
Pred R-Squared 0.790 MAE 1.921
--------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
ANOVA
--------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------
Regression 934.875 2 467.438 70.908 0.0000
Residual 191.172 29 6.592
Total 1126.047 31
--------------------------------------------------------------------
Parameter Estimates
----------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
----------------------------------------------------------------------------------------
(Intercept) 39.686 1.715 23.141 0.000 36.179 43.194
wt -3.191 0.757 -0.518 -4.216 0.000 -4.739 -1.643
cyl -1.508 0.415 -0.447 -3.636 0.001 -2.356 -0.660
----------------------------------------------------------------------------------------
# Forward-selection model: refit mpg on wt, cyl and hp
f_model <- lm(
  formula = mpg ~ wt + cyl + hp,
  data = data1
)
summary(f_model)
Call:
lm(formula = mpg ~ wt + cyl + hp, data = data1)
Residuals:
Min 1Q Median 3Q Max
-3.9290 -1.5598 -0.5311 1.1850 5.8986
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 38.75179 1.78686 21.687 < 2e-16 ***
wt -3.16697 0.74058 -4.276 0.000199 ***
cyl -0.94162 0.55092 -1.709 0.098480 .
hp -0.01804 0.01188 -1.519 0.140015
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.512 on 28 degrees of freedom
Multiple R-squared: 0.8431, Adjusted R-squared: 0.8263
F-statistic: 50.17 on 3 and 28 DF, p-value: 2.184e-11
# Backward-elimination model: refit mpg on disp, hp, wt, qsec and am
b_model <- lm(
  formula = mpg ~ disp + hp + wt + qsec + am,
  data = data1
)
summary(b_model)
Conclusion: Stepwise selection gives mpg ~ wt + cyl as the best model.
# Stepwise-selection candidate: mpg explained by wt and cyl.
# NOTE(review): this chunk used to be labelled "backward model", which looks
# like a copy-paste from the b_model chunk; the name bi_model suggests the
# both-direction stepwise fit -- confirm.
bi_model <- lm(formula = mpg ~ wt + cyl, data = data1)
summary(bi_model)
Call:
lm(formula = mpg ~ wt + cyl, data = data1)
Residuals:
Min 1Q Median 3Q Max
-4.2893 -1.5512 -0.4684 1.5743 6.1004
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 39.6863 1.7150 23.141 < 2e-16 ***
wt -3.1910 0.7569 -4.216 0.000222 ***
cyl -1.5078 0.4147 -3.636 0.001064 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.568 on 29 degrees of freedom
Multiple R-squared: 0.8302, Adjusted R-squared: 0.8185
F-statistic: 70.91 on 2 and 29 DF, p-value: 6.809e-12
# All-possible-subsets regression on the predictors of mr_model:
# store the result, show it, then draw the package's plot for it.
all_p <- ols_step_all_possible(model = mr_model)
all_p
plot(all_p)
Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
# Best-subsets regression on mr_model: the printed result lists the chosen
# predictor set for each model size together with its selection criteria.
b.subset <- ols_step_best_subset(model = mr_model)
b.subset
Best Subsets Regression
-------------------------------------------------------
Model Index Predictors
-------------------------------------------------------
1 wt
2 cyl wt
3 wt qsec am
4 hp wt qsec am
5 disp hp wt qsec am
6 disp hp drat wt qsec am
7 disp hp drat wt qsec am gear
8 disp hp drat wt qsec am gear carb
9 disp hp drat wt qsec vs am gear carb
10 cyl disp hp drat wt qsec vs am gear carb
-------------------------------------------------------
Subsets Regression Summary
---------------------------------------------------------------------------------------------------------------------------------
Adj. Pred
Model R-Square R-Square R-Square C(p) AIC SBIC SBC MSEP FPE HSP APC
---------------------------------------------------------------------------------------------------------------------------------
1 0.7528 0.7446 0.7087 11.6270 166.0294 74.3734 170.4266 296.9167 9.8572 0.3199 0.2801
2 0.8302 0.8185 0.7904 1.2187 156.0101 66.1903 161.8730 211.2280 7.2101 0.2354 0.2049
3 0.8497 0.8336 0.7946 0.1026 154.1194 65.7138 161.4481 193.9735 6.8017 0.2239 0.1933
4 0.8579 0.8368 0.8021 0.7900 154.3274 67.2299 163.1218 190.4637 6.8547 0.2280 0.1948
5 0.8637 0.8375 0.7984 1.8462 154.9740 69.3073 165.2341 189.8793 7.0080 0.2361 0.1992
6 0.8667 0.8347 0.7855 3.3700 156.2687 71.9258 167.9946 193.4796 7.3170 0.2502 0.2079
7 0.8681 0.8296 0.7619 5.1472 157.9333 74.8058 171.1250 199.7867 7.7358 0.2691 0.2198
8 0.8687 0.8230 0.7316 7.0496 159.7853 77.7959 174.4427 207.9040 8.2358 0.2922 0.2340
9 0.8689 0.8153 0.7035 9.0114 161.7271 80.8277 177.8502 217.4086 8.8041 0.3194 0.2502
10 0.8690 0.8066 0.6538 11.0000 163.7098 83.8728 181.2986 228.1554 9.4379 0.3512 0.2682
---------------------------------------------------------------------------------------------------------------------------------
AIC: Akaike Information Criteria
SBIC: Sawa's Bayesian Information Criteria
SBC: Schwarz Bayesian Criteria
MSEP: Estimated error of prediction, assuming multivariate normality
FPE: Final Prediction Error
HSP: Hocking's Sp
APC: Amemiya Prediction Criteria
# Plot the best-subsets result stored in b.subset.
plot(b.subset)
Conclusion: The model selection criteria from all-possible-subsets regression indicate that mpg ~ wt + qsec + am is the best model for mtcars (it is the model selected by Cp, AIC, SBIC and SBC).
# For each criterion, look up the predictor set of the all-possible-subsets
# model that is best on it (largest for the R-squared measures, smallest for
# the others).
# Largest R-squared.
all_p$predictors[all_p$rsquare == max(all_p$rsquare)]
[1] "cyl disp hp drat wt qsec vs am gear carb"
# Largest adjusted R-squared.
all_p$predictors[all_p$adjr == max(all_p$adjr)]
[1] "disp hp wt qsec am"
# Smallest C(p).
all_p$predictors[all_p$cp == min(all_p$cp)]
[1] "wt qsec am"
# Smallest AIC.
all_p$predictors[all_p$aic == min(all_p$aic)]
[1] "wt qsec am"
# Smallest SBIC.
all_p$predictors[all_p$sbic == min(all_p$sbic)]
[1] "wt qsec am"
# Smallest SBC.
all_p$predictors[all_p$sbc == min(all_p$sbc)]
[1] "wt qsec am"
Conclusion: The model selection criteria from best-subsets regression indicate that mpg ~ wt + qsec + am is the best model for mtcars (it is the model selected by Cp, AIC, SBIC and SBC).
# For each criterion, look up the predictor set of the best-subsets model that
# is best on it (largest for the R-squared measures, smallest for the others).
# Largest R-squared.
b.subset$predictors[b.subset$rsquare == max(b.subset$rsquare)]
[1] "cyl disp hp drat wt qsec vs am gear carb"
# Largest adjusted R-squared.
b.subset$predictors[b.subset$adjr == max(b.subset$adjr)]
[1] "disp hp wt qsec am"
# Smallest C(p).
b.subset$predictors[b.subset$cp == min(b.subset$cp)]
[1] "wt qsec am"
# Smallest AIC (Akaike Information Criteria).
b.subset$predictors[b.subset$aic == min(b.subset$aic)]
[1] "wt qsec am"
# Smallest SBIC (Sawa's Bayesian Information Criteria).
b.subset$predictors[b.subset$sbic == min(b.subset$sbic)]
[1] "wt qsec am"
# Smallest SBC (Schwarz Bayesian Criteria).
b.subset$predictors[b.subset$sbc == min(b.subset$sbc)]
[1] "wt qsec am"
Currently we have the following candidate models: 1. wt + qsec + am; 2. wt + cyl; 3. disp + hp + wt + qsec + am; 4. wt + cyl + hp.
# Resampling schemes for the caret model comparisons:
#   train.control1 - a single pass of 5-fold cross-validation
#   train.control2 - 5-fold cross-validation repeated 100 times
train.control1 <- trainControl(
  method = "cv",
  number = 5
)
train.control2 <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 100
)
Conclusion: It is still not clear which model should be chosen.
# Single pass of 5-fold cross-validation (train.control1) for the four
# candidate models. This is k-fold CV with k = 5, not leave-one-out CV.
#
# The RNG seed is reset before every train() call so that each model is
# resampled on the same fold assignment. Without it every call draws its own
# random folds, and the resulting RMSE / R-squared / MAE values are not
# directly comparable between models.
cv_seed <- 123

set.seed(cv_seed)
option1 <- train(
  mpg ~ wt + qsec + am,
  data = data1, method = "lm", trControl = train.control1
)

set.seed(cv_seed)
option2 <- train(
  mpg ~ wt + cyl,
  data = data1, method = "lm", trControl = train.control1
)

set.seed(cv_seed)
option3 <- train(
  mpg ~ disp + hp + wt + qsec + am,
  data = data1, method = "lm", trControl = train.control1
)

set.seed(cv_seed)
option4 <- train(
  mpg ~ wt + cyl + hp,
  data = data1, method = "lm", trControl = train.control1
)

# Resampling summary for option1 (mpg ~ wt + qsec + am).
print(option1)
Linear Regression
32 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 27, 25, 26, 25, 25
Resampling results:
RMSE Rsquared MAE
2.369457 0.8611764 1.982344
Tuning parameter 'intercept' was held constant at a value of TRUE
# Resampling summary for option2 (mpg ~ wt + cyl).
print(option2)
Linear Regression
32 samples
2 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 25, 25, 25, 27, 26
Resampling results:
RMSE Rsquared MAE
2.467026 0.8398594 2.007533
Tuning parameter 'intercept' was held constant at a value of TRUE
# Resampling summary for option3 (mpg ~ disp + hp + wt + qsec + am).
print(option3)
Linear Regression
32 samples
5 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 24, 26, 26, 25, 27
Resampling results:
RMSE Rsquared MAE
2.665735 0.8697989 2.170045
Tuning parameter 'intercept' was held constant at a value of TRUE
# Resampling summary for option4 (mpg ~ wt + cyl + hp).
print(option4)
Linear Regression
32 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 25, 26, 26, 26, 25
Resampling results:
RMSE Rsquared MAE
2.419125 0.8294988 2.082439
Tuning parameter 'intercept' was held constant at a value of TRUE
# Scoreboard with one row per candidate (option1..option4). For every metric
# the row(s) of the best-scoring model get +1: lowest RMSE, highest R-squared,
# lowest MAE. `tot` only fills the total_num column.
tot <- 1
count <- data.frame(
  rmse_count = rep(0, 4),
  rsq_count = rep(0, 4),
  mae_count = rep(0, 4),
  total_num = rep(tot, 4)
)

# Collect one resampled metric from the four fits, in option order.
cv_fits <- list(option1, option2, option3, option4)
cv_metric <- function(name) {
  unlist(lapply(cv_fits, function(fit) fit$results[[name]]))
}
model_ids <- c(1, 2, 3, 4)

# RMSE: smaller is better (despite its name, ind_max1 holds the minimum's row).
rmse_step <- data.frame(ind = model_ids, rmse = cv_metric("RMSE"))
ind_max1 <- with(rmse_step, ind[rmse == min(rmse)])
count[ind_max1, "rmse_count"] <- count[ind_max1, "rmse_count"] + 1

# R-squared: larger is better.
rsq_step <- data.frame(ind = model_ids, rsq = cv_metric("Rsquared"))
ind_max2 <- with(rsq_step, ind[rsq == max(rsq)])
count[ind_max2, "rsq_count"] <- count[ind_max2, "rsq_count"] + 1

# MAE: smaller is better (ind_max3 likewise holds the minimum's row).
mae_step <- data.frame(ind = model_ids, mae = cv_metric("MAE"))
ind_max3 <- with(mae_step, ind[mae == min(mae)])
count[ind_max3, "mae_count"] <- count[ind_max3, "mae_count"] + 1

count
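Conclusion: We should choose the 4th model (mpg ~ wt + cyl + hp): in the repeated 5-fold cross-validation output below it has the lowest RMSE and MAE and the highest R-squared of the four options.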
# 5-fold cross-validation repeated 100 times (train.control2) for the four
# candidate models. This is repeated k-fold CV with k = 5, not leave-one-out
# CV.
#
# The RNG seed is reset before every train() call so that each model is
# resampled on the same fold assignments. Without it every call draws its own
# random folds, and the resulting RMSE / R-squared / MAE values are not
# directly comparable between models.
cv_seed <- 123

set.seed(cv_seed)
option1 <- train(
  mpg ~ wt + qsec + am,
  data = data1, method = "lm", trControl = train.control2
)

set.seed(cv_seed)
option2 <- train(
  mpg ~ wt + cyl,
  data = data1, method = "lm", trControl = train.control2
)

set.seed(cv_seed)
option3 <- train(
  mpg ~ disp + hp + wt + qsec + am,
  data = data1, method = "lm", trControl = train.control2
)

set.seed(cv_seed)
option4 <- train(
  mpg ~ wt + cyl + hp,
  data = data1, method = "lm", trControl = train.control2
)

# Resampling summary for option1 (mpg ~ wt + qsec + am).
print(option1)
Linear Regression
32 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 100 times)
Summary of sample sizes: 27, 25, 25, 25, 26, 26, ...
Resampling results:
RMSE Rsquared MAE
2.616779 0.8461208 2.226053
Tuning parameter 'intercept' was held constant at a value of TRUE
# Resampling summary for option2 (mpg ~ wt + cyl).
print(option2)
Linear Regression
32 samples
2 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 100 times)
Summary of sample sizes: 27, 24, 25, 26, 26, 26, ...
Resampling results:
RMSE Rsquared MAE
2.574947 0.8618947 2.099273
Tuning parameter 'intercept' was held constant at a value of TRUE
# Resampling summary for option3 (mpg ~ disp + hp + wt + qsec + am).
print(option3)
Linear Regression
32 samples
5 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 100 times)
Summary of sample sizes: 27, 25, 27, 24, 25, 25, ...
Resampling results:
RMSE Rsquared MAE
2.600129 0.8491927 2.222816
Tuning parameter 'intercept' was held constant at a value of TRUE
# Resampling summary for option4 (mpg ~ wt + cyl + hp).
print(option4)
Linear Regression
32 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 100 times)
Summary of sample sizes: 27, 25, 24, 26, 26, 27, ...
Resampling results:
RMSE Rsquared MAE
2.553296 0.8657582 2.081244
Tuning parameter 'intercept' was held constant at a value of TRUE
# Scoreboard with one row per candidate (option1..option4). For every metric
# the row(s) of the best-scoring model get +1: lowest RMSE, highest R-squared,
# lowest MAE. `tot` only fills the total_num column.
tot <- 1
count <- data.frame(
  rmse_count = rep(0, 4),
  rsq_count = rep(0, 4),
  mae_count = rep(0, 4),
  total_num = rep(tot, 4)
)

# Collect one resampled metric from the four fits, in option order.
cv_fits <- list(option1, option2, option3, option4)
cv_metric <- function(name) {
  unlist(lapply(cv_fits, function(fit) fit$results[[name]]))
}
model_ids <- c(1, 2, 3, 4)

# RMSE: smaller is better (despite its name, ind_max1 holds the minimum's row).
rmse_step <- data.frame(ind = model_ids, rmse = cv_metric("RMSE"))
ind_max1 <- with(rmse_step, ind[rmse == min(rmse)])
count[ind_max1, "rmse_count"] <- count[ind_max1, "rmse_count"] + 1

# R-squared: larger is better.
rsq_step <- data.frame(ind = model_ids, rsq = cv_metric("Rsquared"))
ind_max2 <- with(rsq_step, ind[rsq == max(rsq)])
count[ind_max2, "rsq_count"] <- count[ind_max2, "rsq_count"] + 1

# MAE: smaller is better (ind_max3 likewise holds the minimum's row).
mae_step <- data.frame(ind = model_ids, mae = cv_metric("MAE"))
ind_max3 <- with(mae_step, ind[mae == min(mae)])
count[ind_max3, "mae_count"] <- count[ind_max3, "mae_count"] + 1

count